library(GO.db)
## Loading required package: AnnotationDbi
## Loading required package: stats4
## Loading required package: BiocGenerics
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, aperm, append, as.data.frame, basename, cbind,
## colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
## get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
## match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
## Position, rank, rbind, Reduce, rownames, sapply, setdiff, table,
## tapply, union, unique, unsplit, which.max, which.min
## Loading required package: Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
## Loading required package: IRanges
## Loading required package: S4Vectors
##
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:utils':
##
## findMatches
## The following objects are masked from 'package:base':
##
## expand.grid, I, unname
##
library(glue)
##
## Attaching package: 'glue'
## The following object is masked from 'package:IRanges':
##
## trim
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ lubridate::%within%() masks IRanges::%within%()
## ✖ dplyr::collapse() masks IRanges::collapse()
## ✖ dplyr::combine() masks Biobase::combine(), BiocGenerics::combine()
## ✖ dplyr::desc() masks IRanges::desc()
## ✖ tidyr::expand() masks S4Vectors::expand()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::first() masks S4Vectors::first()
## ✖ dplyr::lag() masks stats::lag()
## ✖ ggplot2::Position() masks BiocGenerics::Position(), base::Position()
## ✖ purrr::reduce() masks IRanges::reduce()
## ✖ dplyr::rename() masks S4Vectors::rename()
## ✖ lubridate::second() masks S4Vectors::second()
## ✖ lubridate::second<-() masks S4Vectors::second<-()
## ✖ dplyr::select() masks AnnotationDbi::select()
## ✖ dplyr::slice() masks IRanges::slice()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidytext)
# Choose the workflow root (`wd`) and the path of the `reticulate` conda
# environment (`env`) from the current working directory: a path containing
# "Bio_SDD" selects the Bio_SDD layout, anything else the home-directory one.
if (!str_detect(getwd(), "Bio_SDD")) {
  # Default layout directly under the home directory.
  wd <- "/home/shannc/workflow"
  env <- "/home/shannc/anaconda3/envs/reticulate"
} else {
  # Layout used when running from inside the Bio_SDD tree.
  wd <- "/home/shannc/Bio_SDD/MUIC_senior_project/workflow"
  env <- "/home/shannc/Bio_SDD/miniconda3/envs/reticulate"
}
# `info_tb` holds one row per GO term. It is cached on disk as a TSV so the
# table only has to be built from GO.db when the cache file is missing.
go_file <- glue("{wd}/data/reference/go_data.tsv")
if (!file.exists(go_file)) {
  # No cache yet: take every GO ID known to GO.db, build the table with
  # goInfoTb() (defined outside this section), and save it for later runs.
  all_gos <- names(as.list(GOTERM))
  info_tb <- goInfoTb(all_gos)
  write_tsv(info_tb, go_file)
} else {
  # Cache present: read the previously written table back in.
  info_tb <- read_tsv(go_file)
}
## Rows: 42443 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (4): GO_IDs, term, definition, ontology
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#' Count how often each word occurs in the `term` column of a table.
#'
#' @param tb A data frame with a character column named `term`.
#' @param core_word Optional pattern passed to `grepl()`. When not `NULL`,
#'   only rows whose `term` matches it are kept before counting. Note that
#'   `grepl()` treats the pattern as a regular expression.
#' @param filter_unwanted If `TRUE` (the default), drop the words listed in
#'   `unwanted` from the result.
#' @param unwanted Character vector of words to drop. Defaults to the
#'   file-level `UNWANTED` vector. The default is evaluated lazily, so
#'   `UNWANTED` only has to exist when the function is called with
#'   `filter_unwanted = TRUE` and no explicit `unwanted`.
#' @return A table with one row per word: columns `word` and `n`, sorted by
#'   decreasing `n`.
showFrequent <- function(tb, core_word = NULL, filter_unwanted = TRUE,
                         unwanted = UNWANTED) {
  if (!is.null(core_word)) {
    tb <- tb %>%
      filter(grepl(core_word, term))
  }

  # Split every term into single-word tokens, then tally each distinct word.
  word_counts <- tb %>%
    unnest_tokens(word, term) %>%
    count(word, sort = TRUE)

  if (!filter_unwanted) {
    return(word_counts)
  }

  # The stop list is an argument rather than a hard-wired global, so callers
  # can supply their own list without touching `UNWANTED`.
  word_counts %>%
    filter(!word %in% unwanted)
}
Find the common verbs and qualifiers used in GO term names.
# Stop list used by showFrequent(): words removed from the word counts when
# filter_unwanted = TRUE. It mixes prepositions, nouns and an adjective.
UNWANTED <- c(
  "of",
  "to",
  "cell",
  "in",
  "complex",
  "activity",
  "regulation",
  "process",
  "cellular",
  "stimulus",
  "response"
)

# Qualifier words of interest in GO term names (not referenced elsewhere in
# this section).
qualifiers <- c(
  "positive",
  "negative",
  "catabolic",
  "involved"
)